{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "clip_onnx_example.ipynb",
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/main/examples/clip_onnx_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Restart colab session after installation\n",
        "Reload the session if something doesn't work"
      ],
      "metadata": {
        "id": "fxPg_VvZuScV"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "al_QNjyFq6Jj"
      },
      "outputs": [],
      "source": [
        "%%capture\n",
        "!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n",
        "!pip install git+https://github.com/openai/CLIP.git\n",
        "!pip install onnxruntime-gpu"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%capture\n",
        "!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true"
      ],
      "metadata": {
        "id": "42eeJz9lTdJ6"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!nvidia-smi"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XuauIZIBSEUX",
        "outputId": "2c7c2bd9-90dd-4b1a-e98a-79e1f2218644"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Thu Jan  6 16:36:44 2022       \n",
            "+-----------------------------------------------------------------------------+\n",
            "| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |\n",
            "|-------------------------------+----------------------+----------------------+\n",
            "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
            "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
            "|                               |                      |               MIG M. |\n",
            "|===============================+======================+======================|\n",
            "|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |\n",
            "| N/A   35C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |\n",
            "|                               |                      |                  N/A |\n",
            "+-------------------------------+----------------------+----------------------+\n",
            "                                                                               \n",
            "+-----------------------------------------------------------------------------+\n",
            "| Processes:                                                                  |\n",
            "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
            "|        ID   ID                                                   Usage      |\n",
            "|=============================================================================|\n",
            "|  No running processes found                                                 |\n",
            "+-----------------------------------------------------------------------------+\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import onnxruntime\n",
        "print(onnxruntime.get_device())"
      ],
      "metadata": {
        "id": "gqvxpdajRX5_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## CPU inference mode"
      ],
      "metadata": {
        "id": "010k-ksVTjAu"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Torch CLIP"
      ],
      "metadata": {
        "id": "KdTz0IJWVBqE"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import clip\n",
        "from PIL import Image\n",
        "import numpy as np\n",
        "\n",
        "# onnx cannot work with cuda\n",
        "model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n",
        "\n",
        "# batch first\n",
        "image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0).cpu() # [1, 3, 224, 224]\n",
        "image_onnx = image.detach().cpu().numpy().astype(np.float32)\n",
        "\n",
        "# batch first\n",
        "text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]).cpu() # [3, 77]\n",
        "text_onnx = text.detach().cpu().numpy().astype(np.int64)"
      ],
      "metadata": {
        "id": "9ROPwKYurOhP"
      },
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%timeit model(image, text)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1CrHQ8cYt8Cx",
        "outputId": "4d98f85d-4b02-4ae2-b18f-fb3c7a2d6caf"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "1 loop, best of 5: 636 ms per loop\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### CLIP-ONNX"
      ],
      "metadata": {
        "id": "Ao2MriaVVG6Y"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from clip_onnx import clip_onnx, attention\n",
        "clip.model.ResidualAttentionBlock.attention = attention\n",
        "\n",
        "onnx_model = clip_onnx(model)\n",
        "onnx_model.convert2onnx(image, text, verbose=True)\n",
        "# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n",
        "onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # cpu mode"
      ],
      "metadata": {
        "id": "nSeG9uAZrcph",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "8c394684-d78e-49f6-a60f-872485d5f650"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[CLIP ONNX] Start convert visual model\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/clip_onnx/utils.py:40: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
            "  head_dim = q.shape[2] // num_heads\n",
            "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_helper.py:716: UserWarning: allowzero=0 by default. In order to honor zero value in shape use allowzero=1\n",
            "  warnings.warn(\"allowzero=0 by default. In order to honor zero value in shape use allowzero=1\")\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[CLIP ONNX] Start check visual model\n",
            "[CLIP ONNX] Start convert textual model\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2819: UserWarning: Exporting aten::index operator of advanced indexing in opset 14 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n",
            "  \"If indices include negative values, the exported graph will produce incorrect results.\")\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[CLIP ONNX] Start check textual model\n",
            "[CLIP ONNX] Models converts successfully\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%timeit onnx_model(image_onnx, text_onnx)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "B15dr51UrvMh",
        "outputId": "7c5fbc64-61f5-4742-d5a1-24d123971515"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "1 loop, best of 5: 550 ms per loop\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## GPU inference mode\n",
        "Select a runtime GPU to continue:\n",
        "\n",
        "Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. Save it, and you maybe connect to GPU"
      ],
      "metadata": {
        "id": "Ahh_7CkTUb8y"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### CLIP-ONNX"
      ],
      "metadata": {
        "id": "B6M7yq7qceb5"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # GPU mode"
      ],
      "metadata": {
        "id": "6LtPSZhfUd_m"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "onnx_model.visual_session.get_providers() # optional"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xE0VGt9sQwrf",
        "outputId": "6feb4701-7b7f-437e-dc2f-c95c504dbb89"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['CUDAExecutionProvider', 'CPUExecutionProvider']"
            ]
          },
          "metadata": {},
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%timeit onnx_model(image_onnx, text_onnx)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "iPUVzqmgcYas",
        "outputId": "3e7c1526-6e38-4982-ca36-eabfc95c2ab9"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "The slowest run took 79.70 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
            "1 loop, best of 5: 60.8 ms per loop\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Torch CLIP"
      ],
      "metadata": {
        "id": "jb58mrkbch2V"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import clip\n",
        "from PIL import Image\n",
        "\n",
        "device = \"cuda\"\n",
        "# onnx cannot work with cuda\n",
        "model, preprocess = clip.load(\"ViT-B/32\", device=device, jit=False)\n",
        "# batch first\n",
        "image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0).to(device) # [1, 3, 224, 224]\n",
        "text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]).to(device) # [3, 77]"
      ],
      "metadata": {
        "id": "gidR99GOckyF"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%timeit model(image, text)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XpBrtjlOcwOC",
        "outputId": "56375401-18a0-499b-f29b-c6e2d4d07e42"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "10 loops, best of 5: 72.2 ms per loop\n"
          ]
        }
      ]
    }
  ]
}